# -*- coding: utf-8 -*-
"""PaperSimilInfoTech.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1wYp_EwYSWoDzay0g7Paw2udMyYQ_CoKN
"""

# Calculating similarity in different ways.
# Similarity distances by the number of vertices in the taxonomy tree:
# https://habr.com/ru/articles/778048/
# v.17.04.2024

# CBOW stands for Continuous Bag of Words, one of the two word2vec training objectives.

from gensim import models

# Download and unpack the pretrained Google News word2vec vectors (300 dimensions)
!wget -c "https://rzn-obr.ru/GoogleNews-vectors-negative300.bin.gz"
!gzip -d GoogleNews-vectors-negative300.bin.gz

w2v = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Inspect the vector and nearest neighbours of individual words
vect = w2v['Shooting']
w2v.most_similar('Shooting')

vect = w2v['attack']
w2v.most_similar('attack')

word1 = 'attack'
word2 = 'Shooting'
similarity = w2v.similarity(word1, word2)
print(f'Semantic similarity between "{word1}" and "{word2}" is {similarity}')

word1 = 'attack'
word2 = 'bombing'
similarity = w2v.similarity(word1, word2)
print(f'Semantic similarity between "{word1}" and "{word2}" is {similarity}')

# A function for calculating the average word vector of a phrase
import numpy as np

def average_vector(model, phrase):
    # Keep only the words that are present in the model vocabulary
    words = [word for word in phrase.split() if word in model.key_to_index]
    if not words:
        return None
    vectors = [model[word] for word in words]
    avg_vector = np.mean(vectors, axis=0)
    return avg_vector

# Calculating the semantic similarity between two phrases
phrase1 = 'attack attack'
phrase2 = 'venue leaves'

avg_vector1 = average_vector(w2v, phrase1)
avg_vector2 = average_vector(w2v, phrase2)

if avg_vector1 is not None and avg_vector2 is not None:
    # Cosine similarity between the two averaged phrase vectors
    similarity = np.dot(avg_vector1, avg_vector2) / (np.linalg.norm(avg_vector1) * np.linalg.norm(avg_vector2))
    print(f'Semantic similarity between "{phrase1}" and "{phrase2}" is {similarity}')
else:
    print('One of the phrases contains only words that are missing from the model vocabulary.')

# On a certain date, there was an attack on a city facility where many people became victims.
# March 23, 2024: shooting at a Moscow concert venue leaves over 130 dead.

# Imports for the transformer-based and bag-of-words sentence comparisons below
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Calculating semantic similarity of sentences. The trailing comments appear to
# record a source tag and two previously computed similarity scores against sentence1.
sentence1 = "On a certain date, there was an attack on a city facility where many people became victims."  # cnn 1 1
sentence2 = "On March 23, 2024, terrorists attacked Moscow, killing more than 130 citizens."  # IYu 0.904102623462677 0.812596321105957
sentence3 = "March 23 2024 terrorists strike at a packed concert hall in the Russian capital, leaving at least 60 dead."  # RT 0.9349522590637207 0.8757287859916687
sentence4 = "On the morning of 11.09.2001, two Boston planes destroyed the World Trade Center in New York."  # NYTimes 0.8471251726150513 0.7009884119033813
sentence5 = "On October 11, 2022, a new multifunctional medical center was opened in Lugansk."  # TASS 0.8411691188812256 0.6638354063034058
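# The BertTokenizer/BertModel/torch/cosine imports above are not exercised
# anywhere in this notebook, yet the paired scores in the sentence comments
# suggest a BERT-based comparison was run at some point. The sketch below is
# one plausible reconstruction, not the original cell: the model name
# 'bert-base-uncased' and the mean-pooling strategy are assumptions.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
bert.eval()

def bert_sentence_vector(sentence):
    # Tokenize and run a forward pass without gradient tracking
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True)
    with torch.no_grad():
        outputs = bert(**inputs)
    # Mean-pool the last hidden state over the token dimension
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Compare sentence1 against each of the other example sentences;
# scipy's cosine() is a distance, so similarity is 1 - distance
for other in (sentence2, sentence3, sentence4, sentence5):
    sim = 1 - cosine(bert_sentence_vector(sentence1), bert_sentence_vector(other))
    print(f'BERT similarity with "{other[:40]}...": {sim:.4f}')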
# Cosine similarity over simple bag-of-words counts
sentences = ['On a certain date, there was an attack on a city facility where many people became victims.',
             'On March 23, 2024, terrorists attacked Moscow, killing more than 130 citizens.',
             'On October 11, 2022, a new multifunctional medical center was opened in Vladimir.']

vectorizer = CountVectorizer()
sentence_vectors = vectorizer.fit_transform(sentences)

similarity_matrix = cosine_similarity(sentence_vectors, sentence_vectors)
print(similarity_matrix)

# Jaccard
def jaccard_similarity(x, y):
    """Returns the Jaccard similarity between two lists."""
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality / float(union_cardinality)

sentences = [
    #'On a certain date, there was an attack on a city facility where many people became victims.',
    'On March 23, 2024, terrorists attacked Moscow, killing more than 130 citizens.',
    'On October 11, 2022, a new multifunctional medical center was opened in Lugansk.']

# Tokenize to lowercase word lists; without this step the function would
# compare character sets rather than word sets.
sentences = [sent.lower().split(" ") for sent in sentences]

jaccard_similarity(sentences[0], sentences[1])

from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize in newer NLTK releases
nltk.download('wordnet')

# A function for getting synonyms of each word in a sentence
def get_synonyms(sentence_tokens):
    synonyms = []
    for word in sentence_tokens:
        for syn in wn.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
    return set(synonyms)

# A function for calculating the semantic similarity of two sentences
def sentence_similarity(sentence1, sentence2):
    tokens1 = word_tokenize(sentence1)
    tokens2 = word_tokenize(sentence2)

    synonyms1 = get_synonyms(tokens1)
    synonyms2 = get_synonyms(tokens2)

    score = 0.0
    count = 0

    # For every synonym from the first sentence, find the best Wu-Palmer
    # similarity against the synonyms of the second sentence
    for word1 in synonyms1:
        best_score = 0.0
        for word2 in synonyms2:
            sim = wn.synsets(word1)[0].wup_similarity(wn.synsets(word2)[0])
            if sim is not None and sim > best_score:
                best_score = sim
        score += best_score
        count += 1

    if count == 0:
        return 0.0
    return score / count

# Semantic similarity of the two example sentences defined above
similarity_score = sentence_similarity(sentence1, sentence2)
print(f'Semantic similarity of the two example sentences: {similarity_score}')

#!pip install nltk
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')

# Despite the name, this computes the maximum Wu-Palmer similarity over all
# sense pairs of the two words, not the Lesk gloss-overlap measure.
def lesk_similarity(word1, word2):
    synset1 = wordnet.synsets(word1)
    synset2 = wordnet.synsets(word2)

    max_similarity = -1
    for sense1 in synset1:
        for sense2 in synset2:
            similarity = sense1.wup_similarity(sense2)
            if similarity is not None and similarity > max_similarity:
                max_similarity = similarity

    return max_similarity

word1 = 'awareness'
word2 = 'informing'
similarity = lesk_similarity(word1, word2)
print(f"The similarity between '{word1}' and '{word2}' is: {similarity}")
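# The article linked at the top measures similarity by the number of vertices
# on the path between concepts in the WordNet taxonomy tree. A minimal sketch
# of that idea, using the standard NLTK measures: path_similarity is
# 1 / (shortest_path_length + 1) over hypernym edges, the direct path-count
# analogue of the wup_similarity used above. The helper name and the choice of
# first noun senses are illustrative, not from the original notebook.

def taxonomy_similarities(word1, word2):
    # Compare the first noun senses of both words with two path-based measures
    s1 = wordnet.synsets(word1, pos=wordnet.NOUN)
    s2 = wordnet.synsets(word2, pos=wordnet.NOUN)
    if not s1 or not s2:
        return None
    return {
        'path': s1[0].path_similarity(s2[0]),
        'wup': s1[0].wup_similarity(s2[0]),
    }

print(taxonomy_similarities('attack', 'bombing'))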